{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import lightgbm as lgb\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "import datetime\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('../Large_output/train_df_new.csv')\n",
    "train_df['meter_reading'] = np.expm1(train_df[\"meter_reading\"])\n",
    "train_df.loc[(train_df['site_id']==0) & (train_df['meter']==0),'meter_reading']=\\\n",
    "train_df.loc[(train_df['site_id']==0) & (train_df['meter']==0),'meter_reading'].mul(0.2931)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "target = np.log1p(train_df[\"meter_reading\"])\n",
    "features = train_df[['building_id',\\\n",
    " 'site_id',\\\n",
    " 'primary_use',\\\n",
    " 'meter',\\\n",
    " 'dayofweek',\\\n",
    " 'square_feet',\\\n",
    " 'year_built',\\\n",
    " 'floor_count',\\\n",
    " 'air_temperature',\\\n",
    " 'cloud_coverage',\\\n",
    " 'dew_temperature',\\\n",
    " 'precip_depth_1_hr',\\\n",
    " 'sea_level_pressure',\\\n",
    " 'wind_direction',\\\n",
    " 'wind_speed',\\\n",
    " 'relative_humidity',\\\n",
    " 'feels_like',\\\n",
    " 'hour']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_features = ['building_id', 'site_id', 'primary_use', 'meter','dayofweek']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "\n",
    "def reduce_mem_usage(df, use_float16=False):\n",
    "    \"\"\"\n",
    "    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    \n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage of dataframe is {:.2f} MB\".format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == \"int\":\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype(\"category\")\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage after optimization is: {:.2f} MB\".format(end_mem))\n",
    "    print(\"Decreased by {:.1f}%\".format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2640.82 MB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/shuozhang/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:24: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "/Users/shuozhang/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:22: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "/Users/shuozhang/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:31: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage after optimization is: 568.51 MB\n",
      "Decreased by 78.5%\n",
      "Memory usage of dataframe is 5726.29 MB\n",
      "Memory usage after optimization is: 1232.74 MB\n",
      "Decreased by 78.5%\n"
     ]
    }
   ],
   "source": [
    "features=reduce_mem_usage(features,use_float16=True)\n",
    "X_test=reduce_mem_usage(X_test,use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2640.82 MB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:24: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:22: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:31: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage after optimization is: 568.51 MB\n",
      "Decreased by 78.5%\n"
     ]
    }
   ],
   "source": [
    "import xgboost as xgb\n",
    "from sklearn.metrics import mean_squared_error\n",
    "features=reduce_mem_usage(features,use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:4.09856\tvalid-rmse:4.11155\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 200 rounds.\n",
      "[50]\ttrain-rmse:0.999885\tvalid-rmse:1.00408\n",
      "[100]\ttrain-rmse:0.820916\tvalid-rmse:0.833524\n",
      "[150]\ttrain-rmse:0.769534\tvalid-rmse:0.798885\n",
      "[200]\ttrain-rmse:0.737511\tvalid-rmse:0.779073\n",
      "[250]\ttrain-rmse:0.71614\tvalid-rmse:0.767724\n",
      "[300]\ttrain-rmse:0.697023\tvalid-rmse:0.757964\n",
      "[350]\ttrain-rmse:0.683936\tvalid-rmse:0.753678\n",
      "[400]\ttrain-rmse:0.673517\tvalid-rmse:0.750265\n",
      "[450]\ttrain-rmse:0.664385\tvalid-rmse:0.74754\n",
      "[500]\ttrain-rmse:0.657322\tvalid-rmse:0.746102\n",
      "[550]\ttrain-rmse:0.650531\tvalid-rmse:0.744552\n",
      "[600]\ttrain-rmse:0.645264\tvalid-rmse:0.744046\n",
      "[650]\ttrain-rmse:0.639814\tvalid-rmse:0.743107\n",
      "[700]\ttrain-rmse:0.634518\tvalid-rmse:0.742406\n",
      "[750]\ttrain-rmse:0.630082\tvalid-rmse:0.741935\n",
      "[800]\ttrain-rmse:0.626031\tvalid-rmse:0.74203\n",
      "[850]\ttrain-rmse:0.621977\tvalid-rmse:0.74196\n",
      "[900]\ttrain-rmse:0.618614\tvalid-rmse:0.741943\n",
      "[950]\ttrain-rmse:0.615313\tvalid-rmse:0.741979\n",
      "[1000]\ttrain-rmse:0.612479\tvalid-rmse:0.742168\n",
      "Stopping. Best iteration:\n",
      "[824]\ttrain-rmse:0.624159\tvalid-rmse:0.741887\n",
      "\n",
      "Fold 1 | rmse: 0.7418874607853603\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:4.10666\tvalid-rmse:4.07735\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 200 rounds.\n",
      "[50]\ttrain-rmse:0.986558\tvalid-rmse:1.06456\n",
      "[100]\ttrain-rmse:0.80048\tvalid-rmse:0.92138\n",
      "[150]\ttrain-rmse:0.752451\tvalid-rmse:0.892215\n",
      "[200]\ttrain-rmse:0.722688\tvalid-rmse:0.875356\n",
      "[250]\ttrain-rmse:0.699638\tvalid-rmse:0.863528\n",
      "[300]\ttrain-rmse:0.683237\tvalid-rmse:0.856248\n",
      "[350]\ttrain-rmse:0.669486\tvalid-rmse:0.850807\n",
      "[400]\ttrain-rmse:0.659716\tvalid-rmse:0.84763\n",
      "[450]\ttrain-rmse:0.651771\tvalid-rmse:0.845365\n",
      "[500]\ttrain-rmse:0.644381\tvalid-rmse:0.843647\n",
      "[550]\ttrain-rmse:0.637901\tvalid-rmse:0.842337\n",
      "[600]\ttrain-rmse:0.632808\tvalid-rmse:0.841816\n",
      "[650]\ttrain-rmse:0.627555\tvalid-rmse:0.84098\n",
      "[700]\ttrain-rmse:0.623182\tvalid-rmse:0.840779\n",
      "[750]\ttrain-rmse:0.619155\tvalid-rmse:0.840529\n",
      "[800]\ttrain-rmse:0.615066\tvalid-rmse:0.840152\n",
      "[850]\ttrain-rmse:0.61129\tvalid-rmse:0.839719\n",
      "[900]\ttrain-rmse:0.608119\tvalid-rmse:0.839563\n",
      "[950]\ttrain-rmse:0.605193\tvalid-rmse:0.839516\n",
      "[1000]\ttrain-rmse:0.602279\tvalid-rmse:0.839569\n",
      "[1050]\ttrain-rmse:0.599175\tvalid-rmse:0.839245\n",
      "[1100]\ttrain-rmse:0.59634\tvalid-rmse:0.839357\n",
      "[1150]\ttrain-rmse:0.593931\tvalid-rmse:0.839308\n",
      "[1200]\ttrain-rmse:0.591733\tvalid-rmse:0.839514\n",
      "Stopping. Best iteration:\n",
      "[1045]\ttrain-rmse:0.599523\tvalid-rmse:0.839214\n",
      "\n",
      "Fold 2 | rmse: 0.8392136492174501\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:4.10341\tvalid-rmse:4.09708\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 200 rounds.\n",
      "[50]\ttrain-rmse:0.995087\tvalid-rmse:1.03318\n",
      "[100]\ttrain-rmse:0.820024\tvalid-rmse:0.877656\n",
      "[150]\ttrain-rmse:0.764657\tvalid-rmse:0.838237\n",
      "[200]\ttrain-rmse:0.732883\tvalid-rmse:0.818822\n",
      "[250]\ttrain-rmse:0.709522\tvalid-rmse:0.805935\n",
      "[300]\ttrain-rmse:0.691759\tvalid-rmse:0.798073\n",
      "[350]\ttrain-rmse:0.678747\tvalid-rmse:0.793046\n",
      "[400]\ttrain-rmse:0.667609\tvalid-rmse:0.789206\n",
      "[450]\ttrain-rmse:0.659453\tvalid-rmse:0.786696\n",
      "[500]\ttrain-rmse:0.651977\tvalid-rmse:0.785093\n",
      "[550]\ttrain-rmse:0.645329\tvalid-rmse:0.784107\n",
      "[600]\ttrain-rmse:0.639456\tvalid-rmse:0.783294\n",
      "[650]\ttrain-rmse:0.633913\tvalid-rmse:0.782653\n",
      "[700]\ttrain-rmse:0.629134\tvalid-rmse:0.781777\n",
      "[750]\ttrain-rmse:0.624877\tvalid-rmse:0.781434\n",
      "[800]\ttrain-rmse:0.62069\tvalid-rmse:0.781379\n",
      "[850]\ttrain-rmse:0.616887\tvalid-rmse:0.781034\n",
      "[900]\ttrain-rmse:0.613657\tvalid-rmse:0.780829\n",
      "[950]\ttrain-rmse:0.610548\tvalid-rmse:0.780757\n",
      "[1000]\ttrain-rmse:0.607357\tvalid-rmse:0.780811\n",
      "[1050]\ttrain-rmse:0.6043\tvalid-rmse:0.78054\n",
      "[1100]\ttrain-rmse:0.601403\tvalid-rmse:0.78045\n",
      "[1150]\ttrain-rmse:0.598618\tvalid-rmse:0.780452\n",
      "[1200]\ttrain-rmse:0.596189\tvalid-rmse:0.780434\n",
      "[1250]\ttrain-rmse:0.59347\tvalid-rmse:0.780563\n",
      "[1300]\ttrain-rmse:0.591126\tvalid-rmse:0.78078\n",
      "Stopping. Best iteration:\n",
      "[1134]\ttrain-rmse:0.599547\tvalid-rmse:0.780333\n",
      "\n",
      "Fold 3 | rmse: 0.7803328594655591\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:4.0994\tvalid-rmse:4.10682\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 200 rounds.\n",
      "[50]\ttrain-rmse:1.00208\tvalid-rmse:0.994173\n",
      "[100]\ttrain-rmse:0.823693\tvalid-rmse:0.820339\n",
      "[150]\ttrain-rmse:0.770739\tvalid-rmse:0.779659\n",
      "[200]\ttrain-rmse:0.738412\tvalid-rmse:0.758209\n",
      "[250]\ttrain-rmse:0.715088\tvalid-rmse:0.744332\n",
      "[300]\ttrain-rmse:0.697997\tvalid-rmse:0.735124\n",
      "[350]\ttrain-rmse:0.685538\tvalid-rmse:0.72936\n",
      "[400]\ttrain-rmse:0.675374\tvalid-rmse:0.72589\n",
      "[450]\ttrain-rmse:0.666507\tvalid-rmse:0.723159\n",
      "[500]\ttrain-rmse:0.658197\tvalid-rmse:0.720393\n",
      "[550]\ttrain-rmse:0.650842\tvalid-rmse:0.71869\n",
      "[600]\ttrain-rmse:0.645439\tvalid-rmse:0.717804\n",
      "[650]\ttrain-rmse:0.64004\tvalid-rmse:0.71696\n",
      "[700]\ttrain-rmse:0.63489\tvalid-rmse:0.716153\n",
      "[750]\ttrain-rmse:0.630355\tvalid-rmse:0.715716\n",
      "[800]\ttrain-rmse:0.626215\tvalid-rmse:0.715525\n",
      "[850]\ttrain-rmse:0.622854\tvalid-rmse:0.715186\n",
      "[900]\ttrain-rmse:0.61959\tvalid-rmse:0.714995\n",
      "[950]\ttrain-rmse:0.616152\tvalid-rmse:0.714623\n",
      "[1000]\ttrain-rmse:0.613156\tvalid-rmse:0.714771\n",
      "[1050]\ttrain-rmse:0.610166\tvalid-rmse:0.714397\n",
      "[1100]\ttrain-rmse:0.607173\tvalid-rmse:0.714447\n",
      "[1150]\ttrain-rmse:0.604711\tvalid-rmse:0.714717\n",
      "[1200]\ttrain-rmse:0.602217\tvalid-rmse:0.71498\n",
      "[1250]\ttrain-rmse:0.599885\tvalid-rmse:0.715\n",
      "Stopping. Best iteration:\n",
      "[1057]\ttrain-rmse:0.609807\tvalid-rmse:0.714394\n",
      "\n",
      "Fold 4 | rmse: 0.7143942661171114\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:4.0991\tvalid-rmse:4.1157\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 200 rounds.\n",
      "[50]\ttrain-rmse:1.00419\tvalid-rmse:1.00973\n",
      "[100]\ttrain-rmse:0.82228\tvalid-rmse:0.843475\n",
      "[150]\ttrain-rmse:0.768298\tvalid-rmse:0.806672\n",
      "[200]\ttrain-rmse:0.739544\tvalid-rmse:0.790521\n",
      "[250]\ttrain-rmse:0.716653\tvalid-rmse:0.777986\n",
      "[300]\ttrain-rmse:0.699252\tvalid-rmse:0.771598\n",
      "[350]\ttrain-rmse:0.683277\tvalid-rmse:0.765087\n",
      "[400]\ttrain-rmse:0.671436\tvalid-rmse:0.761229\n",
      "[450]\ttrain-rmse:0.662362\tvalid-rmse:0.758912\n",
      "[500]\ttrain-rmse:0.65433\tvalid-rmse:0.757258\n",
      "[550]\ttrain-rmse:0.647973\tvalid-rmse:0.756688\n",
      "[600]\ttrain-rmse:0.6433\tvalid-rmse:0.75665\n",
      "[650]\ttrain-rmse:0.638119\tvalid-rmse:0.75614\n",
      "[700]\ttrain-rmse:0.633384\tvalid-rmse:0.755457\n",
      "[750]\ttrain-rmse:0.629116\tvalid-rmse:0.755572\n",
      "[800]\ttrain-rmse:0.624843\tvalid-rmse:0.755587\n",
      "[850]\ttrain-rmse:0.621311\tvalid-rmse:0.755636\n",
      "[900]\ttrain-rmse:0.617639\tvalid-rmse:0.755797\n",
      "Stopping. Best iteration:\n",
      "[708]\ttrain-rmse:0.632601\tvalid-rmse:0.755305\n",
      "\n",
      "Fold 5 | rmse: 0.7553051170770692\n",
      "\n",
      "Mean rmse = 0.76622667053251\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "mean_squared_error() missing 1 required positional argument: 'y_pred'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-13-d4ee3414f032>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     62\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     63\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"\\nMean rmse = {score}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 64\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Out of folds rmse = {np.sqrt(mean_squared_error((target, y_oof)))}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m: mean_squared_error() missing 1 required positional argument: 'y_pred'"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold\n",
    "columns = features.columns\n",
    "kf = StratifiedKFold(n_splits=5, shuffle=False, random_state=45)\n",
    "splits=kf.split(features,train_df['month'])\n",
    "score = 0\n",
    "NFOLDS=5\n",
    "feature_importance_df = pd.DataFrame()\n",
    "out_folder_train_prediction= pd.DataFrame()\n",
    "models=[]\n",
    "y_oof = np.zeros(features.shape[0])\n",
    "for fold_n, (train_index, valid_index) in enumerate(splits):\n",
    "    dtrain = xgb.DMatrix(features.iloc[train_index],target.iloc[train_index])\n",
    "    dvalid = xgb.DMatrix(features.iloc[valid_index], target.iloc[valid_index])\n",
    "    y_valid=target.iloc[valid_index]\n",
    "\n",
    "    params = {'eval_metric': 'rmse',\\\n",
    "              'objective': 'reg:squarederror',\\\n",
    "              'booster':'gbtree',\\\n",
    "              'nthread' : 4,\\\n",
    "              'eta' : 0.05,\\\n",
    "              'max_leaves': 1800,\\\n",
    "              'max_depth' : 12,\\\n",
    "              'subsample' : 0.1,\\\n",
    "              'colsample_bytree' : 0.9,\\\n",
    "              'colsample_bylevel' : 0.9,\\\n",
    "              'gamma':0,\\\n",
    "              'max_bin':180,\\\n",
    "              'min_child_weight':3.0,\\\n",
    "              'reg_alpha':2.0,\\\n",
    "              'reg_lambda':0.1,\n",
    "              'tree_method': 'gpu_hist',\n",
    "              'n_gpus': 2}\n",
    "\n",
    "    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]\n",
    "        \n",
    "    model=xgb.train(params, dtrain, 2000, watchlist, maximize=False, early_stopping_rounds = 200, verbose_eval=50)\n",
    "    \n",
    "    y_pred_valid = model.predict(dvalid, ntree_limit=model.best_ntree_limit)\n",
    "\n",
    "    \n",
    "    oof_preds=pd.DataFrame()\n",
    "    oof_preds['train_index']=valid_index\n",
    "    oof_preds['TARGET']= y_pred_valid\n",
    "    oof_preds[\"folder\"]=fold_n + 1\n",
    "    out_folder_train_prediction = pd.concat([out_folder_train_prediction, oof_preds], axis=0)\n",
    "    \n",
    "    fold_importance_df = pd.DataFrame()\n",
    "    fold_importance_df = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)\n",
    "    fold_importance_df[\"fold\"] = fold_n + 1\n",
    "    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
    "    \n",
    "    print(f\"Fold {fold_n + 1} | rmse: {np.sqrt(mean_squared_error(y_valid, y_pred_valid))}\")\n",
    "    \n",
    "    score += np.sqrt(mean_squared_error(y_valid,y_pred_valid)) / NFOLDS\n",
    "    \n",
    "    y_oof[valid_index] = y_pred_valid\n",
    "    \n",
    "    models.append(model)\n",
    "          \n",
    "    del dtrain, dvalid, watchlist, y_valid\n",
    "    gc.collect()\n",
    "    \n",
    "print(f\"\\nMean rmse = {score}\")\n",
    "# one error here but deletion will make this work and this is not necessary\n",
    "# print(f\"Out of folds rmse = {np.sqrt(mean_squared_error((target, y_oof)))}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 5726.29 MB\n",
      "Memory usage after optimization is: 1232.74 MB\n",
      "Decreased by 78.5%\n"
     ]
    }
   ],
   "source": [
    "test_df = pd.read_csv('../Large_output/test_1.074.csv')\n",
    "test_df = test_df[['building_id',\\\n",
    " 'site_id',\\\n",
    " 'primary_use',\\\n",
    " 'meter',\\\n",
    " 'dayofweek',\\\n",
    " 'square_feet',\\\n",
    " 'year_built',\\\n",
    " 'floor_count',\\\n",
    " 'air_temperature',\\\n",
    " 'cloud_coverage',\\\n",
    " 'dew_temperature',\\\n",
    " 'precip_depth_1_hr',\\\n",
    " 'sea_level_pressure',\\\n",
    " 'wind_direction',\\\n",
    " 'wind_speed',\\\n",
    " 'relative_humidity',\\\n",
    " 'feels_like',\\\n",
    " 'hour']]\n",
    "test_df=reduce_mem_usage(test_df,use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:7: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
      "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
      "  import sys\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2817678ac50445c5890e722f50e8558a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=120.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "41697600\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm_notebook as tqdm\n",
    "# group the prediction into 120 to save the space\n",
    "def predictions(models, iterations = 120):\n",
    "    results = []\n",
    "    set_size = len(test_df)\n",
    "    batch_size = set_size // iterations\n",
    "    meter_reading = []\n",
    "    for i in tqdm(range(iterations)):\n",
    "        pos = i*batch_size\n",
    "        temp_df = test_df.iloc[pos : pos+batch_size]\n",
    "        dtest = xgb.DMatrix(temp_df)\n",
    "        fold_preds = [np.expm1(model.predict(dtest, ntree_limit=model.best_ntree_limit)) for model in models]\n",
    "        meter_reading.extend(np.mean(fold_preds, axis=0))\n",
    "    print(len(meter_reading))\n",
    "    assert len(meter_reading) == set_size\n",
    "    test_df['meter_reading']=np.clip(meter_reading, 0, a_max=None)\n",
    "    test_df.loc[(test_df['site_id']==0) & \n",
    "                 (test_df['meter']==0),'meter_reading']=test_df.loc[(test_df['site_id']==0) &\n",
    "                                                            (test_df['meter']==0),'meter_reading'].mul(3.4118)\n",
    "    submission = pd.read_csv('../Resources/sample_submission.csv')\n",
    "    submission['meter_reading'] = test_df['meter_reading']\n",
    "    return submission\n",
    "df_result = predictions(models)\n",
    "df_result.to_csv('../Large_output/xgb_new_csv_nomonth.csv',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>row_id</th>\n",
       "      <th>meter_reading</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>172.321182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>62.858246</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>6.296654</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>293.548431</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1443.788940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>9.223751</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>134.421921</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>7</td>\n",
       "      <td>540.996765</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>8</td>\n",
       "      <td>772.756470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>9</td>\n",
       "      <td>403.090210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>10</td>\n",
       "      <td>53.982338</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>11</td>\n",
       "      <td>8.299406</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>12</td>\n",
       "      <td>1382.560059</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>13</td>\n",
       "      <td>468.836823</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>14</td>\n",
       "      <td>295.426300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>15</td>\n",
       "      <td>288.604279</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>16</td>\n",
       "      <td>38.904301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>17</td>\n",
       "      <td>340.119629</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>18</td>\n",
       "      <td>590.603821</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>19</td>\n",
       "      <td>224.752029</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    row_id  meter_reading\n",
       "0        0     172.321182\n",
       "1        1      62.858246\n",
       "2        2       6.296654\n",
       "3        3     293.548431\n",
       "4        4    1443.788940\n",
       "5        5       9.223751\n",
       "6        6     134.421921\n",
       "7        7     540.996765\n",
       "8        8     772.756470\n",
       "9        9     403.090210\n",
       "10      10      53.982338\n",
       "11      11       8.299406\n",
       "12      12    1382.560059\n",
       "13      13     468.836823\n",
       "14      14     295.426300\n",
       "15      15     288.604279\n",
       "16      16      38.904301\n",
       "17      17     340.119629\n",
       "18      18     590.603821\n",
       "19      19     224.752029"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_result.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
